dtable(ss)2 Expression data analysis
2.1 Sample sheet
The correspondence between the methylome and transcriptome names are:
- vector with FULL protein (without doxy): "Tat" (RNAseq [Tat_1, Tat_2, Tat_3]) = "TAT_NO" (microarrays [HC_SEN 3,9,14])
- vector with PARTIAL protein (without doxy):"Tat72" (RNAseq [Tat72_1, Tat72_2, Tat72_3]) = "Exon1_No" (microarrays [HC_SEN 5,11,16])
- vector with FULL protein (with doxy): "TatDOX" (RNAseq [TatDOX_1, TatDOX_2, TatDOX_3]) = "TAT_Dox" (microarrays [HC_SEN 4,10,15])
- vector with PARTIAL protein (with doxy): "Tat72DOX" (RNAseq [Tat72DOX_1, Tat72DOX_2, Tat72DOX_3]) = "Exon1_Dox" (microarrays [HC_SEN 6,12])
- vector without protein (empty): "TetOFF" (RNAseq [TetOFF_1, TetOFF_2, TetOFF_3]) = "Control_NO" (microarrays [HC_SEN 1,7,13])
- vector without protein (empty) + doxy: "TetOFFDOX" (RNAseq [TetOFFDOX_1, TetOFFDOX_2, TetOFF_3]) = "Control_Dox" (microarrays [HC_SEN 2,8])
2.2 Expression:
2.2.1 Counnts:
namedlist <- function(...) {
nl <- list(...)
argnames <- sys.call()
n<-sapply(argnames[-1], as.character)
names(nl)<-n
return(nl)
}
library(readxl)
library(data.table)
counts<-list()
Tat72_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/Tat72VSTetOFF.xlsx", sheet = 1)New names:
• `` -> `...1`
colnames(Tat72_off) <- c("gene",sapply(colnames(Tat72_off),function(x)unlist(strsplit(x,"\\."))[7])[-1])
setDT(Tat72_off)
Tat72_off[,ID :=tstrsplit(gene,"\\.",keep=1)]
Tat_Tat72 <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTat72.xlsx", sheet = 1)New names:
• `` -> `...1`
colnames(Tat_Tat72) <- c("gene",sapply(colnames(Tat_Tat72),function(x)unlist(strsplit(x,"\\."))[7])[-1])
setDT(Tat_Tat72)
Tat_Tat72[,ID :=tstrsplit(gene,"\\.",keep=1)]
Tat_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTetOFF.xlsx", sheet = 1)New names:
• `` -> `...1`
colnames(Tat_off) <- c("gene",sapply(colnames(Tat_off),function(x)unlist(strsplit(basename(x),"\\."))[1])[-1])
setDT(Tat_off)
Tat_off[,ID :=tstrsplit(gene,"\\.",keep=1)]
counts <-namedlist(Tat72_off,Tat_Tat72,Tat_off)dtable(counts[[1]])Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
2.3 DEGs:
library(readxl)
library(data.table)
Tat72_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/Tat72VSTetOFF.xlsx", sheet = 3)
Tat72_off$Contrast="Tat72_off"
Tat_Tat72 <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTat72.xlsx", sheet = 3)
Tat_Tat72$Contrast="Tat_Tat72"
Tat_off <- readxl::read_xlsx("data-raw/Transcriptome_HClinic_Vcasanova_20042023/TatVSTetOFF.xlsx", sheet = 3)
Tat_off$Contrast="Tat_off"dt_DEGs <- data.table::data.table(rbind(Tat_Tat72,Tat72_off,Tat_off))
# dt_DEGs <- dt_DEGs[RealFC<0.5 | RealFC >2,]dtable(dt_DEGs)Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
2.4 Methylation:
“El primer objetivo es ver si la proteína TAT del VIH, induce cambios en el metiloma. La proteína TAT está codificada por 2 exones que realizan funciones diferentes y queremos además aislar el efecto de cada uno, por eso hemos generado unas líneas celulares de jurkat establemente transfectadas con:
- Un vector vacío que será nuestro control: muestras 1, 7 y 13 (triplicados)
- Un vector con la proteína TAT completa: muestras 3, 9 y 14 (triplicados)
- Un vector con solo el primer exón de TAT: muestras 5, 11 y 16 (triplicados)
La comparativa entre estos tres grupos es la primera que nos interesa. a-b, a-c y b-c.”
Por otro lado, la DOXYCICLINA “apaga” la expresión de TAT y el segundo objetivo entonces es ver si el “apagado” de TAT con la DOXY hace algún cambio en el metiloma. Por eso la comparativa aquí sería cada una de las tres líneas anteriores con y sin DOXY, así que tendremos además las siguientes muestras:
- Un vector vacío que será nuestro control + DOXY: muestras 2 y 8 (duplicados)
- Un vector con con la proteína TAT completa + DOXY: muestras 4, 10 y 15 (triplicados)
- Un vector con solo el primer exón de TAT + DOXY: muestras 6 y 12 (duplicados)
La segunda comparativa que nos interesa, por tanto, es. a-d, b-e y c-f.
While the first objective yielded positive results no difference was found for the second (Doxi) apporach.
2.4.1 PCA plot:
PCR plots:
2.4.2 DMRS:
dmrs<-readRDS("data/dmrs.rds")
ano_cols<-c("seqnames","start","end","width","no.cpgs","HMFDR","meandiff","Contrast")
dt_list<- lapply(
1:NROW(dmrs),function(rn){ # For every position or ProbeID do:
UCSC <- data.table( # 1. generate a data.table (per row)
dmrs[rn, # 2. split UCSC gene and position
lapply( # 3. new row for every combination of gene/position
overlapping.genes, # 4. add Contrast porbeID and bval difference
function(x)unlist(strsplit(x,","))
)
,by=ano_cols]
)
})
dt_dmrs <- rbindlist(dt_list)
DMRs <- dt_dmrs[,Gene:=V1][!is.na(V1),]
DMRs$V1 <- NULL
# c1<-DMRs[Contrast== "Control_No-Exon1_No",.(.SD,meandiff=-meandiff,Contrast="Tat72_off"),.SDcols=ano_cols[!ano_cols %in% c("meandiff","Contrast")]]
c1<-DMRs[Contrast== "Control_No-Exon1_No",]
c1$Contrast <- "Tat72_off"
c1$meandiff <- -c1$meandiff
c2<-DMRs[Contrast== "Control_No-TAT_No",]
c2$Contrast <- "Tat_off"
c2$meandiff <- -c2$meandiff
c3<-DMRs[Contrast== "Exon1_No-TAT_No",]
c3$Contrast <- "Tat_Tat72"
c3$meandiff <- -c3$meandiff
DMRs<-rbind(c1,c2,c3)
saveRDS(DMRs,"data/DMRs.rds")dtable(DMRs)2.5 DMRs vs expression
dt_DEG_DMR <- merge(DMRs,dt_DEGs,by=c("Gene","Contrast"),all = T)
tab <- dt_DEG_DMR[,.(DEGs=sum(!is.na(ID)),DMRs=sum(!is.na(start)),overlap=sum(!(is.na(start)) &!(is.na(ID)) )),by=c("Contrast")]
kableExtra::kbl(tab)|>kableExtra::kable_classic_2()| Contrast | DEGs | DMRs | overlap |
|---|---|---|---|
| Tat_Tat72 | 3087 | 1092 | 224 |
| Tat_off | 3102 | 990 | 207 |
| Tat72_off | 1594 | 53 | 8 |
2.5.1 Venn diagrams:
2.5.2 DEG/DMR overlapping Correlations:
Now we want to get the mean beta value for each sample in the dmr location: 1. Find all cpg that fall in the dmr 2. get the mean for each sample 3. Match each sample bVal with it’s expression value. (No way to do it, since there is no key sample to sample) 4. Make correlation 5. Plot.
ol <- dt_DEG_DMR[!(is.na(start)) &!(is.na(ID)),]library(data.table)
library(GenomicRanges)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
locs <- IlluminaHumanMethylationEPICanno.ilm10b2.hg19::Locations
locs.ranges <- GRanges(locs$chr, IRanges(locs$pos, locs$pos))
names(locs.ranges) <- rownames(locs)
dt_DEG_DMR_intersect<-lapply(1:NROW(ol),function(nr){
cont<-ol[nr,Contrast]
xpr <- counts[[cont]][ID==ol[nr,ID]]
xpr<-xpr[,.SD,.SDcols=names(xpr)[!(names(xpr)%in%c("ID","gene"))]]
expression <- suppressWarnings(melt(xpr,measure.vars=colnames(xpr),variable.name = "SID",value.name = "TPM" ))
SIDs <- as.character(expression$SID)
cgs <- names(subsetByOverlaps(locs.ranges,GRanges(ol[nr,])))
b<-betas[probeID %in% cgs ,lapply(.SD,function(x)mean(x,na.rm=T)),.SDcols=SIDs]
meth <- suppressWarnings( melt(b,measure.vars=colnames(b),variable.name = "SID",value.name = "DMR.meth" ))
# list(meth,expression)
m <- merge(meth,expression,by="SID")
m <- m[,lapply(.SD,as.numeric),by=SID]
dt<-cbind(ol[nr,],m)
cor<-cor.test(dt$DMR.meth,dt$TPM)
dt$correlation <- cor$estimate
dt$cor.pval <- cor$p.value
return(dt)
})|>rbindlist()
dt_DEG_DMR_intersect[,grp:=tstrsplit(SID,"_",keep = 1)]dt_cor <-dt_DEG_DMR_intersect
dt_cor$Contrast <- as.factor(dt_cor$Contrast)
dt_cor$SID <- as.factor(dt_cor$SID)
setorder(dt_cor,cor.pval,Gene)
saveRDS(dt_cor,"data/dt_cor.rds")2.5.2.1 Table for DMR & DEG:
In the following table the values for methylation and expression are integrated:
dtable(dt_cor[,.SD,.SDcols=c("Gene","Contrast","SID","DMR.meth","TPM","correlation","cor.pval")])2.5.3 Plots
library(ggplot2)
library(data.table)
library(ggpubr)
library(ggrepel)
DMR_DEG_plots <- list()
plots_folder <- "analysis/Integration_with_expression/DEGvsDMR/pval"
dir.create(plots_folder,recursive = T,showWarnings = F)
setkeyv(dt_cor,c("cor.pval","Gene","width"))
for(cont in unique(dt_cor$Contrast)){
dt<-head(dt_cor[Contrast==cont,],60)
lg <- dt[,.(gene=unique(Gene),width=unique(width)),by=cor.pval]
plist <- lapply(1:NROW(lg),function(nr){
pdata <- dt_cor[lg[nr,]]
plot_name <- paste0("/DEG_DMR.pval_", nr, ".png")
plt_path <- paste(plots_folder, cont, sep = .Platform$file.sep)
dir.create(plt_path,recursive = T,showWarnings = F)
p<-ggplot(data=pdata, aes(x=as.numeric(TPM),y=as.numeric(DMR.meth),colour=SID)) +
geom_point(aes(shape=grp),size=8,alpha=0.6) +
xlab("Mean DMR methylation") + ylab("Transcripts x Milion") +
geom_smooth(method = "lm",
inherit.aes = F,
aes(x=as.numeric(pdata$TPM),y=as.numeric(pdata$DMR.meth)),
linetype="dashed",
se=F,
formula = "y ~ x")+
annotate(geom = "text",
x = -Inf, y = Inf,
label=paste0("correlation: ", round(unique(pdata$correlation),4), ", p-value: ", round(unique(pdata$cor.pval),8)),
hjust = 0, vjust = 1
) +
ggtitle(paste0(" Correlation between expression and methylation for gene: ",unique(pdata$Gene)))
ggsave(path = plt_path, filename = plot_name, plot=p)
return(p)
})
DMR_DEG_plots[[cont]] <- plist
}
saveRDS(DMR_DEG_plots,"data/DMR_DEG_plots.rds")2.5.3.1 Tat_Tat72
DMR_DEG_plots[["Tat_Tat72"]]2.5.3.2 Tat_off
DMR_DEG_plots[["Tat_off"]]2.5.3.3 Tat72_off
DMR_DEG_plots[["Tat72_off"]]